In [1]:
import pandas as pd
import numpy as np
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
In [2]:
# Build a synthetic events DataFrame (one row per user event) for the Sankey demo.
events = ['Home', 'Cart', 'Product', 'Cancel', 'Purchase', 'Category', 'Brand', 'History']
# NOTE(review): 'Andriod' here and the 'paltform' column below are misspelled.
# They are left unchanged so existing outputs / downstream lookups keep working.
platforms = ['Andriod', 'iOS', 'PC']

# Tunable knobs for the synthetic data; a fixed seed makes Restart & Run All reproducible.
SEED = 42
N_EVENTS = 10000
N_USERS = 5000


def random_dates(start, end, n, rng=None):
    """
    Draw ``n`` random timestamps in the half-open interval [start, end).

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the interval; both are truncated to whole epoch seconds.
    n : int
        Number of timestamps to draw.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the legacy global ``np.random``
        state is used (the original behaviour), so results are only
        reproducible if the caller seeds that global state.

    Returns
    -------
    pandas.DatetimeIndex
        ``n`` timestamps with one-second resolution.
    """
    # Both samplers share the (low, high, size) signature with an exclusive upper bound.
    draw = np.random.randint if rng is None else rng.integers

    start_u = start.value // 10**9
    end_u = end.value // 10**9

    return pd.to_datetime(draw(start_u, end_u, n), unit='s')


start = pd.to_datetime('2022-01-01')
end = pd.to_datetime('2022-08-01')

# One seeded generator drives every random draw in this cell.
rng = np.random.default_rng(SEED)
df = pd.DataFrame(rng.integers(0, N_USERS, size=(N_EVENTS, 1)), columns=['user_id'])

df['time'] = random_dates(start, end, n=N_EVENTS, rng=rng)
df['event_name'] = rng.choice(events, len(df))
df['paltform'] = rng.choice(platforms, len(df))

df.head()
Out[2]:
user_id time event_name paltform
0 2613 2022-07-11 10:06:07 Brand PC
1 4788 2022-07-17 22:14:09 Cancel PC
2 2729 2022-05-25 06:36:18 Home Andriod
3 3160 2022-05-15 16:10:41 Home Andriod
4 1073 2022-01-30 04:41:28 Purchase PC
In [3]:
def filter_starting_step(x, starting_step, n_steps):
    """
    Cut a single user's event sequence down to a fixed-length window.

    The window opens at the first occurrence of ``starting_step`` in ``x`` and
    covers at most ``n_steps`` items (fewer when the sequence ends sooner).
    ``x.index`` raises ``ValueError`` if ``starting_step`` does not occur in ``x``.
    """
    first = x.index(starting_step)
    window = slice(first, first + n_steps)
    return x[window]
In [4]:
import random
# Alternative: if you have no palette in mind, random hex colours can be generated instead:
#   number_of_colors = len(events)
#   color = ["#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
#            for _ in range(number_of_colors)]

# Saturated palette, one entry per event, in the same order as `events`.
color_n = [
    '#55CBCD', '#CBAACB', '#FF968A', '#4DD091',
    '#FF5768', '#0065A2', '#57838D', '#FFC500',
]
# Lighter palette, one entry per event, in the same order as `events`.
color_l = [
    '#D4F0F0', '#ECD5E3', '#FFDBCC', '#E0F8F5',
    '#FFEFFF', '#9EDDEF', '#D7D2EA', '#FFF7C2',
]

# Event name -> colour lookups built positionally from the two palettes.
color_dictn = {name: color_n[pos] for pos, name in enumerate(events)}
color_dictl = {name: color_l[pos] for pos, name in enumerate(events)}
In [5]:
def user_journey(df, starting_step, n_steps=5):
    """
    Build the node and link data for a Sankey diagram of user journeys.

    Each user's events are ordered by time, cut to the first ``n_steps`` events
    starting at their first ``starting_step`` event, and identical consecutive
    step pairs are counted across users.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``time`` and ``event_name``.
    starting_step : str
        Event name at which every journey starts.
    n_steps : int, default 5
        Maximum number of steps per journey; must be at least 2.

    Returns
    -------
    label_list : list of str
        Sorted node labels of the form ``"<step number>: <event name>"``.
    colors_node : list of str
        One colour per entry of ``label_list`` (looked up in ``color_dictn``).
    colors_link : list of str
        One colour per row of ``source_target_df``, chosen by the link's target
        event (looked up in ``color_dictl``).
    source_target_df : pandas.DataFrame
        Columns ``source``, ``target``, ``count``, ``source_id``, ``target_id``;
        the ids are positions in ``label_list``.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 2 (no source-target pair can be formed).
    """
    if n_steps < 2:
        raise ValueError('n_steps must be at least 2 to form source-target pairs')

    fallback_color = '#CCCCCC'  # used for events missing from the colour dictionaries

    # Journeys must be chronological per user; groupby keeps the row order
    # inside each group, so sorting first is what makes the sequences correct.
    ordered = df.sort_values(['user_id', 'time'])

    # find the users that have performed the starting_step
    valid_ids = ordered.loc[ordered['event_name'] == starting_step, 'user_id'].unique()
    if len(valid_ids) == 0:
        empty = pd.DataFrame(columns=['source', 'target', 'count', 'source_id', 'target_id'])
        return [], [], [], empty

    def _padded_journey(steps):
        # Pad short journeys with 'End' so every journey has exactly n_steps entries.
        window = filter_starting_step(steps, starting_step, n_steps)
        return window + ['End'] * (n_steps - len(window))

    # plan out the journey per user, with each step in a separate column
    journeys = ordered[ordered['user_id'].isin(valid_ids)] \
        .groupby('user_id')['event_name'] \
        .agg(list) \
        .apply(_padded_journey)

    step_cols = list(range(n_steps))
    flow = pd.DataFrame(journeys.tolist(), columns=step_cols)

    # add the step number as prefix to each step
    for i in step_cols:
        flow[i] = '{}: '.format(i + 1) + flow[i].astype(str)

    # count the number of identical journeys up the max step defined
    flow = flow.groupby(step_cols).size().reset_index(name='count')

    # transform flow df into source-target pairs, one frame per consecutive step pair
    pairs = [
        flow[[src, dst, 'count']].rename(columns={src: 'source', dst: 'target'})
        for src, dst in zip(step_cols[:-1], step_cols[1:])
    ]
    source_target_df = pd.concat(pairs) \
        .groupby(['source', 'target'], as_index=False)['count'].sum()

    # filter out the end step; match the exact padded label rather than any
    # substring so events whose name merely contains 'End' are kept
    touches_end = source_target_df['source'].str.endswith(': End') | \
        source_target_df['target'].str.endswith(': End')
    # copy so the id columns added below are not written to a view
    source_target_df = source_target_df[~touches_end].copy()

    # create the nodes labels list (sorted for a deterministic node order)
    label_list = sorted(set(source_target_df['source']) | set(source_target_df['target']))
    label_index = {label: pos for pos, label in enumerate(label_list)}

    def _event_name(label):
        # strip the "<step number>: " prefix added above
        return label.split(': ', 1)[1]

    # exactly one colour per node / per link, so the lists always line up
    colors_node = [color_dictn.get(_event_name(label), fallback_color)
                   for label in label_list]
    colors_link = [color_dictl.get(_event_name(label), fallback_color)
                   for label in source_target_df['target']]

    # add index for source-target pair
    source_target_df['source_id'] = source_target_df['source'].map(label_index)
    source_target_df['target_id'] = source_target_df['target'].map(label_index)

    return label_list, colors_node, colors_link, source_target_df
In [6]:
def plot_user_flow(df, starting_step, n_steps=5, title='Sankey Diagram'):
    """
    Assemble a Plotly figure dict showing user journeys as a Sankey diagram.

    The node labels, colours and link table come from ``user_journey``; this
    function only arranges them into the ``data`` / ``layout`` structure and
    returns it (nothing is rendered here).
    """
    labels, node_colors, link_colors, pairs = user_journey(df, starting_step, n_steps)

    # node appearance and labels
    node_spec = dict(
        pad=20,
        thickness=20,
        color=node_colors,
        line=dict(color="black", width=0.5),
        label=labels,
    )

    # links between nodes, expressed as positions in the label list
    link_spec = dict(
        source=pairs['source_id'].values.tolist(),
        target=pairs['target_id'].values.tolist(),
        value=pairs['count'].astype(int).values.tolist(),
        color=link_colors,
        hoverlabel=dict(bgcolor='#C2C4C7'),
    )

    sankey_trace = dict(type='sankey', node=node_spec, link=link_spec)

    # From five steps upwards use a fixed width per step so steps are evenly
    # spaced; below that the width is left unset.
    width = n_steps * 250 if n_steps >= 5 else None

    layout = dict(
        height=700,
        width=width,
        margin=dict(t=30, l=0, r=0, b=30),
        title=title,
        font=dict(size=16),
    )

    return dict(data=[sankey_trace], layout=layout)
In [7]:
# Build the figure for journeys that start at 'Home' (four steps deep) and render it inline.
fig = plot_user_flow(
    df,
    starting_step='Home',
    n_steps=4,
    title='Customer Journey Sankey Diagram',
)
iplot(fig)
In [ ]:
 
In [ ]: